rm(list = ls())

## run on R version 3.5.1, will probably need R 3.0.2 or higher
library(xts)                            #version 0.11.0
library(forecast)                       #version 8.4
library(strucchange)                    #version 1.5.1
library(dplyr)                          #version 0.7.6
library(plyr)                           #version 1.8.4

setwd("")

## data <- read.table(
##     "./prep/arabic_usgeo_counts_20150901_to_20170213.txt.gz",
##     sep="\t",skip=1,fill=T
## )

## names(data) <- c(
##     "date","userid","user_lang","lang","usa","state",
##     "arabic_name","nonarabic_name","spanish_name","nonspanish_name","missing_coords","tweets"
## )

## data <- subset(
##     data,
##     ## missing_coords=="False"
##     ## &
##     userid %in% subset(
##                       data,
##                       arabic_name=="True" & usa=="True" & state!=""
##                   )$userid
## )

## data$date <- as.POSIXct(as.Date(data$date, "%Y_%m_%d"))
## data$tweets <- as.numeric(data$tweets)
## data$count <- rep(1, nrow(data))
## data$arabic_speaker <- with(data, lang=="ar" | user_lang=="ar")

## users <- aggregate(
##     tweets ~ userid, data = subset(data, date <= "2017-02-01"),
##     FUN = sum
## )
## names(users)[2] <- "total.tweets"



## data <- data %>%
##     select(date, userid, tweets, arabic_speaker, missing_coords)

## save(
##     data,
##     file="./data/arabic_usgeo_counts_20150901_to_20170213.RData"
## )

load(
    file="./data/arabic_usgeo_counts_20150901_to_20170213.RData"
)

data <- subset(
    data,
    missing_coords=="False"
    )

## note: duplicated if use more than one language in a day.
## -- n_distinct is important
agg <- data %>%
    group_by(date) %>%
    dplyr::summarise(count = n_distinct(userid),
              tweets = sum(tweets)
              )
## missing/partial data in collection (creates drop artifact)
agg <- subset(agg, !(as.character(as.Date(date)) %in% c(
    "2016-05-23","2016-05-24","2016-05-25",
    "2016-05-26","2016-05-27","2016-05-28","2016-05-29",
    "2016-05-30","2016-05-31"
    )))

agg$avg.tweets <- with(agg, (tweets / count))

## main result
aggts.noscale <- xts(agg$count, order.by=agg$date)

## this smoother dampens weekly seasonality --
## without it, breakpoints will identify the ends of weekends before events as
## the starts of the drops
fit.noscale <- tbats(
    aggts.noscale, use.box.cox=FALSE, use.trend=F,
    use.damped.trend=T, seasonal.periods=7
)

## for appendix tweet per day comparison
aggts <- xts(scale(agg$count), order.by=agg$date)
aggts.tweets <- xts(scale(agg$avg.tweets), order.by=agg$date)

fit <- tbats(
    aggts, use.box.cox=FALSE, use.trend=F,
    use.damped.trend=T, seasonal.periods=7
)
fit.tweets <- tbats(
    aggts.tweets, use.box.cox=FALSE, use.trend=F,
    use.damped.trend=T, seasonal.periods=7
)

## crimson hexagon muslim ban mentions
trend <- read.csv(
    "./data/Muslim ban volume trend from 2015-03-07 to 2017-03-31.csv"
)
trend$Date <- as.Date(trend$Date, "%m/%d/%y")
trend$log.posts <- with(trend, log(Total.Posts+1))

pdf("./figs/arabic_name_precise_geotag.pdf", width=10, height=5)
## pdf plot 2
## result
plot(
    index(aggts.noscale),
    aggts.noscale,
    type="l", bty="n",
    xlab="Date", ylab="Number of geo-tagged users", main="",
    col="purple"
)
smooth.mean <- fit.noscale$x[1,]
lines(index(aggts.noscale), smooth.mean, lwd=3, col="purple")

## breakpoints
abline(
    ## 2 breaks
    v=index(aggts.noscale)[breakpoints(smooth.mean~1,breaks=2)$breakpoints],
    lwd=3, lty=2
)
mtext(
    text=as.Date(
        index(aggts.noscale)[breakpoints(smooth.mean~1,breaks=2)$breakpoints]
    ),
    at=index(aggts.noscale)[breakpoints(smooth.mean~1, breaks=2)$breakpoints])
mtext(
    ## 5 breaks
    text=as.Date(
        index(aggts.noscale)[breakpoints(smooth.mean~1)$breakpoints]
    )[-c(1,5)],
    at=index(aggts.noscale)[breakpoints(smooth.mean~1)$breakpoints][-c(1,5)],
    cex=0.75
)

## regression lines
prex <- mean(aggts.noscale[index(aggts.noscale) < "2015-12-07"])
## pre speech
prex <- predict(
    lm(smooth.mean[index(aggts.noscale) < "2015-12-07"] ~
           as.numeric(index(aggts.noscale)[index(aggts.noscale) < "2015-12-07"])
       )
)
segments(
    x0=as.POSIXct("2015-09-01"), x1=as.POSIXct("2015-12-05"),
    y0=prex[1], y1=rev(prex)[1],
    lwd=3
)
## speech to election
midx <- predict(
    lm(smooth.mean[index(aggts.noscale) >= "2015-12-07"
                   & index(aggts.noscale) < "2016-11-08"] ~
           as.numeric(
               index(aggts.noscale)[index(aggts.noscale) >= "2015-12-07"
                                    & index(aggts.noscale) < "2016-11-08"]
           )
       )
)
segments(
    x0=as.POSIXct("2015-12-07"), x1=as.POSIXct("2016-11-06"),
    y0=midx[1], y1=rev(midx)[1],
    lwd=3
)
## post election
postx <- predict(
    lm(smooth.mean[index(aggts.noscale) >= "2016-11-08"] ~
           as.numeric(
               index(aggts.noscale)[index(aggts.noscale) >= "2016-11-08"]
           )
       )
)
segments(
    x0=as.POSIXct("2016-11-08"), x1=as.POSIXct("2017-02-01"),
    y0=postx[1], y1=rev(postx)[1], lwd=3
)

## muslim ban mentions
with(trend, lines(as.POSIXct(Date), scale(log.posts)*20 + 100, col="red"))

## annotation
abline(v=as.POSIXct("2015-12-07"), col="red", lwd=3)
abline(v=as.POSIXct("2016-11-08"), col="red", lwd=3)
abline(v=as.POSIXct("2017-01-27"), col="red", lwd=3)
legend(
    as.POSIXct("2016-01-01"), 170,
    legend=c(
        "'muslim' AND 'ban' (logged scaled, all Twitter mentions)",
        "geo-tagged users with Arabic names"
    ),
    bty="n", col=c("red","purple"), lwd=2, cex=0.8
)


## pdf plot 2
## news transcript muslim mentions
load(
    "./data/transcript_count_muslim_mentions_by_date.RData"
)

count.muslim.bydate <- data.frame(count.muslim.bydate)
names(count.muslim.bydate) <- c("date","n")
count.muslim.bydate$date <- as.Date(count.muslim.bydate$date, "%B %d %Y")
count.muslim.bydate <- count.muslim.bydate[order(count.muslim.bydate$date),]

## compare from twitter muslim ban mentions, overall transcripts muslim mentions
with(
    count.muslim.bydate,
    plot(
        as.POSIXct(date), scale(n),
        type="l", col="black", ylim=c(-3, 5),
        xlim=as.POSIXct(c("2015-04-01", "2016-12-10")),
        bty="n", xlab="Date", ylab="Mentions (sd)",
        lwd=2
    )
)
with(
    trend,
    lines(
        as.POSIXct(Date), scale(log.posts),
        col="red", type="l", lwd=2
    )
)
legend(
    "bottomright",
    legend=c(
        "'muslim' AND 'ban' (Twitter, logged)",
        "ANY 'Muslim' mention (TV news scripts)"
    ),
    bty="n", col=c("red","black"), lwd=2, cex=0.8
)

dev.off()


pdf("./figs/arabic_name_precise_geotag_with_tweets.pdf", width=5, height=5)
plot(
    index(aggts), aggts,
    type="l", bty="n", xlab="Date",
    ylab="Geo-tagged users/tweets (scaled to sd)", main="",
    lwd=0,
    xlim=c(as.POSIXct("2016-07-01"),max(index(aggts))), ylim=c(-2.5, 4.5)
)
lines(index(aggts), aggts, col="purple")
lines(index(aggts), aggts.tweets, col="orange")
smooth.mean <- fit$x[1,]        #prior version was combination of this and loess
lines(index(aggts), smooth.mean, lwd=3, col="purple")
smooth.mean.tweets <- fit.tweets$x[1,]
lines(index(aggts), smooth.mean.tweets, lwd=3, col="orange")

## breakpoints
abline(
    v=index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints],
    lwd=3, lty=2
)
mtext(
    text=as.Date(index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints]),
    at=index(aggts)[breakpoints(smooth.mean~1, breaks=2)$breakpoints]
)

## regression lines
prex <- mean(aggts[index(aggts) < "2015-12-07"])
##
prex <- predict(
    lm(smooth.mean[index(aggts) < "2015-12-07"] ~
           as.numeric(index(aggts)[index(aggts) < "2015-12-07"])
       )
)
segments(
    x0=as.POSIXct("2015-09-01"), x1=as.POSIXct("2015-12-05"),
    y0=prex[1], y1=rev(prex)[1], lwd=3
)
##
midx <- predict(
    lm(smooth.mean[index(aggts) >= "2015-12-07" & index(aggts) < "2016-11-08"] ~
           as.numeric(index(aggts)[index(aggts) >= "2015-12-07"
                                   & index(aggts) < "2016-11-08"])
       )
)
segments(
    x0=as.POSIXct("2015-12-07"), x1=as.POSIXct("2016-11-06"),
    y0=midx[1], y1=rev(midx)[1], lwd=3
)
##
postx <- predict(
    lm(smooth.mean[index(aggts) >= "2016-11-08"] ~
           as.numeric(index(aggts)[index(aggts) >= "2016-11-08"])
       )
)
segments(
    x0=as.POSIXct("2016-11-08"), x1=as.POSIXct("2017-02-01"),
    y0=postx[1], y1=rev(postx)[1], lwd=3, cex=0.8
)

## annotation
abline(v=as.POSIXct("2015-12-07"), col="red", lwd=3)
abline(v=as.POSIXct("2016-11-08"), col="red", lwd=3)
abline(v=as.POSIXct("2017-01-27"), lwd=3, col="red")
legend(
    "topleft",
    legend=c(
        "\ngeo-tagged tweets\n(by users with Arabic names)",
        "\ngeo-tagged users\n(with Arabic names)"
    ),
    bty="n", col=c("orange","purple"), lwd=2, cex=0.75
)
dev.off()
















## data <- read.table(
##     "./prep/arabic_usgeo_counts_20150901_to_20170213.txt.gz",
##     sep="\t",skip=1,fill=T
## )

## names(data) <- c(
##     "date","userid","user_lang","lang","usa","state",
##     "arabic_name","nonarabic_name","spanish_name","nonspanish_name","missing_coords","tweets"
## )

## data$date <- as.POSIXct(as.Date(data$date, "%Y_%m_%d"))
## data$tweets <- as.numeric(data$tweets)
## data$count <- rep(1, nrow(data))

load(
    file="./data/arabic_usgeo_counts_20150901_to_20170213.RData"
)

data <- subset(
    data,
    missing_coords=="True"
    )

## agg <- subset(
##     data,
##     missing_coords=="True"
##     & userid %in% subset(data, arabic_name=="True" & usa=="True" & state!="")$userid) %>%
##     group_by(date) %>%
##     dplyr::summarise(count = n_distinct(userid), tweets = sum(tweets))

agg <- data %>%
    group_by(date) %>%
    dplyr::summarise(count = n_distinct(userid), tweets = sum(tweets))

data <- data %>%
    select(date, userid, tweets)


agg <- subset(agg, !(as.character(as.Date(date)) %in% c(
    "2016-05-23","2016-05-24","2016-05-25",
    "2016-05-26","2016-05-27","2016-05-28","2016-05-29",
    "2016-05-30","2016-05-31"
    )))

aggts <- xts(scale(agg$count), order.by=agg$date)
aggts.noscale <- xts(agg$count, order.by=agg$date)

agg$avg.tweets <- with(agg, (tweets / count))
aggts.tweets <- xts(scale(agg$avg.tweets), order.by=agg$date)



fit <- tbats(aggts, use.box.cox=FALSE, use.trend=F, use.damped.trend=T, seasonal.periods=7)

fit.noscale <- tbats(aggts.noscale, use.box.cox=FALSE, use.trend=F, use.damped.trend=T, seasonal.periods=7)

fit.tweets <- tbats(aggts.tweets, use.box.cox=FALSE, use.trend=F, use.damped.trend=T, seasonal.periods=7)

pdf("./figs/arabic_name_coarse_geotag.pdf", width=5, height=5)

plot(
    index(aggts), aggts,
    type="l", bty="n",
    xlab="Date", ylab="Geo-tagged users/tweets (scaled to sd)", lwd=0,
    xlim=c(as.POSIXct("2016-07-01"),max(index(aggts))), ylim=c(-2.5, 4.5),
    main="Coarse (e.g. state-level) geotags"
)
lines(index(aggts), aggts, col="purple")
lines(index(aggts), aggts.tweets, col="orange")
abline(v=as.POSIXct("2015-12-07"), col="red", lwd=3)
abline(v=as.POSIXct("2016-11-08"), col="red", lwd=3)
## ## from precise geotag breakpoints
## abline(
##     v=index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints], lwd=3, lty=2
## )
## mtext(
##     text=as.Date(index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints]),
##     at=index(aggts)[breakpoints(smooth.mean~1, breaks=2)$breakpoints]
## )
abline(v=as.POSIXct("2017-01-27"), lwd=3, col="red")
legend(
    "topleft",
    legend=c(
        "\ngeo-tagged tweets\n(by users with Arabic names)",
        "\ngeo-tagged users\n(with Arabic names)"), bty="n",
    col=c("orange","purple"), lwd=2, cex=0.75
)


plot(
    index(aggts), aggts, type="l", bty="n",
    xlab="Date", ylab="Geo-tagged users/tweets (scaled to sd)", lwd=0,
    xlim=c(as.POSIXct("2015-09-01"),max(index(aggts))), ylim=c(-2.5, 4.5),
    main="Coarse (e.g. state-level) geotags"
)
lines(index(aggts), aggts, col="purple")
lines(index(aggts), aggts.tweets, col="orange")
abline(v=as.POSIXct("2015-12-07"), col="red", lwd=3)
abline(v=as.POSIXct("2016-11-08"), col="red", lwd=3)
## abline(
##     v=index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints], lwd=3, lty=2
## )
## mtext(
##     text=as.Date(index(aggts)[breakpoints(smooth.mean~1,breaks=2)$breakpoints]),
##     at=index(aggts)[breakpoints(smooth.mean~1, breaks=2)$breakpoints]
## )
abline(v=as.POSIXct("2017-01-27"), lwd=3, col="red")
legend(
    "topleft",
    legend=c(
        "\ngeo-tagged tweets\n(by users with Arabic names)",
        "\ngeo-tagged users\n(with Arabic names)"
    ), bty="n", col=c("orange","purple"), lwd=2, cex=0.75
)

dev.off()
